In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import xgboost
Load only the users with a known destination:
In [2]:
train_users = pd.read_csv('../cache/train_users.csv')
Replace NaN values with -1.
In [3]:
train_users.fillna(-1, inplace=True)
Select the proper X and y. The labels must be encoded into integers to be usable by XGBoost:
In [4]:
# Separate the target from the features
y_train = train_users['country_destination']
train_users.drop(['country_destination', 'id'], axis=1, inplace=True)
x_train = train_users.values

# Encode the country labels as integers 0..11
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
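Note that label_encoder keeps the mapping between the integer classes and the original country codes, so predictions can be decoded later. A minimal sketch, not part of the pipeline below:

label_encoder.classes_                       # the 12 destination codes, in encoded order
label_encoder.inverse_transform([0, 1, 2])   # integers back to country codes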
To train xgboost models we need a DMatrix. It can be created with the following command:
In [5]:
train_data = xgboost.DMatrix(x_train, encoded_y_train)
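As a side note: since the NaNs were filled with -1 above, one could instead let xgboost handle them natively by flagging that value as missing when building the DMatrix. A sketch of this alternative (the name alt_train_data is just for illustration; it is not used below):

alt_train_data = xgboost.DMatrix(x_train, encoded_y_train, missing=-1.0)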
To monitor the model's performance as training advances, we define the scoring function; for this competition it is NDCG@5:
In [6]:
def ndcg5_score(preds, dtrain):
    labels = dtrain.get_label()
    # For each sample, take the 5 classes with the highest predicted probability
    top = []
    for i in range(preds.shape[0]):
        top.append(np.argsort(preds[i])[::-1][:5])
    top = np.array(top)
    # Binary relevance: 1 where the true label appears in the top 5, 0 elsewhere
    mat = np.reshape(np.repeat(labels, np.shape(top)[1]) == top.ravel(), top.shape).astype(int)
    # Discount each hit by log2 of its rank and average over all samples
    score = np.mean(np.sum(mat / np.log2(np.arange(2, mat.shape[1] + 2)), axis=1))
    return 'ndcg5', score
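A quick sanity check of the metric on a toy problem with two samples and three classes (values chosen only for illustration):

toy_preds = np.array([[0.1, 0.6, 0.3],
                      [0.5, 0.2, 0.3]])
toy_data = xgboost.DMatrix(np.zeros((2, 1)), label=np.array([1, 2]))
ndcg5_score(toy_preds, toy_data)
# label 1 is ranked first (gain 1.0), label 2 second (gain 1/log2(3) ≈ 0.63),
# so the returned score is their mean, ≈ 0.82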
Finally, we set the model parameters and run 10-fold cross-validation to check the reliability of the results:
In [7]:
param = {
    'max_depth': 10,
    'learning_rate': 1,
    'objective': 'multi:softprob',
    'num_class': 12,
    'gamma': 0,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'base_score': 0.5,
    'verbosity': 0,
    'nthread': 4,
    'seed': 42
}
num_round = 10
xgboost.cv(param, train_data, num_boost_round=num_round, nfold=10, metrics=['mlogloss'], feval=ndcg5_score)
Out[7]:
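xgboost.cv returns a pandas DataFrame with the per-round train/test means and standard deviations of each metric. A sketch for picking the best number of rounds from it (column names such as 'test-mlogloss-mean' follow the usual xgboost naming and may differ between versions):

cv_results = xgboost.cv(param, train_data, num_boost_round=num_round,
                        nfold=10, metrics=['mlogloss'], feval=ndcg5_score)
best_round = cv_results['test-mlogloss-mean'].idxmin()   # round with the lowest validation loss
cv_results.loc[best_round]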